

# Estimates hierarchical random effects conjoint model.

# WARNING: this will take a LONG time to run. ALSO, if you want to use your computer
# while this code runs in the background, don't use all cores. Change 
# options(mc.cores = ...) on line 29 to desired number of cores. But in order to 
# replicate our results *exactly* you may need to use 8 cores. 


library(readstata13)
library(dplyr)
library(rstan)
library(MASS)
library(ggpubr)
library(Hmisc)

# some helper functions for plotting
if(!require(wpmarble)){
  devtools::install_github("wpmarble/wpmarble")
  require(wpmarble)
}


rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
chains = options()$mc.cores 



select <- dplyr::select

# Is this a test? If so, only keep n_test respondents.
test = FALSE
n_test = 1200
set.seed(7121992)


# Load and Clean Data -----------------------------------------------------

origdat = read.dta13("Data/conjoint_for_r.dta")

# create indicators for conjoint levels
levs = c("educ", "hieduc", "invest", "gov", "workers", "local")
for (l in levs){
  
}

# create variable for income quartile (within MSA sample) and race
origdat = origdat %>% 
  group_by(msa) %>% 
  mutate(inc_quartile = cut(as.numeric(income), 
                            breaks = wtd.quantile(as.numeric(income), c(0, .25, .5, .75, 1), na.rm=TRUE, weights = weight), 
                            include.lowest = TRUE, 
                            labels = c("Income: First quartile", "Income: Second quartile",
                                       "Income: Third quartile", "Income: Fourth quartile")),
         race2 = ifelse(white == 1, "White", ifelse(black == 1, "Black", ifelse(latino == 1, "Latino", "Other Non-White")))) %>% 
  ungroup() 

# create party variables
origdat = origdat %>% 
  rename(party_id_nolean = party_id,
         pid7 = party_strong_leaning) %>% 
  mutate(dem_lean = as.numeric(pid7 %in% c(1, 2, 3)),
         rep_lean = as.numeric(pid7 %in% c(5, 6, 7)),
         ind_lean = as.numeric(pid7 == 4))
origdat = origdat %>% 
  mutate(pid7_2 = factor(pid7, labels = c("Strong Dem.", "Weak Dem.", "Lean Dem.", "Independent", "Lean Rep.", "Weak Rep.", "Strong Rep.")))


# create other misc variables
origdat = origdat %>% 
  mutate(homeowner = as.numeric(own_where_living == "Own"),
       looking_for_work = as.numeric(empl_status %in% c("Unemployed, looking for work", "Employed part time, looking for additional work")),
       years_in_msa2 = cut(years_in_msa, c(0, 1, 5, 10, 15, 100), include.lowest = TRUE, labels = c("Less than a year", "1-5 years", "6-10 years", "11-15 years", "16 or more years"))) %>% 
  mutate(party_id_lean = ifelse(dem_lean == 1, "Democrat", ifelse(rep_lean == 1, "Republican", "Independent"))) 






# Subset Data for Testing?  -----------------------------------------------


if (test){
  keepresp = sample(unique(origdat$caseid), n_test)
  dat = origdat %>% filter(caseid %in% keepresp)
} else {
  dat = origdat
}



# Create components for Stan ----------------------------------------------

# Which individual-level covariates to include? 
cov.list = c("pid7_2", "age3150", "age5165", "agegt65", "black", "latino", 
             "other", "female", "college", "inc_quartile", "looking_for_work", 
             "homeowner", "years_in_msa2", "msa")

# Find cases where there are NA's in covariates
na_cases = apply(dat[, cov.list], 1, function(x) any(is.na(x)))
na_caseids = unique(dat$caseid[na_cases])


# Drop cases w/ NA's and order the data frame by respondent
dat = dat %>% 
  filter(!caseid %in% na_caseids) %>% 
  mutate(i = as.numeric(factor(caseid))) %>% 
  arrange(i)
ind_dat = dat %>% distinct(caseid, .keep_all = TRUE) %>% arrange(i)


# Individual-level covariates
rhs_form = as.formula(paste0("~ ", paste(cov.list, collapse = " + ")))
ind_covariates = model.matrix(rhs_form, data = ind_dat)
ind_covariates_w_variation = apply(ind_covariates, 2, function(x) sd(x) != 0)
ind_covariates = cbind(1, ind_covariates[, ind_covariates_w_variation])
colnames(ind_covariates)[1] = "Intercept"
k_indvl = ncol(ind_covariates)

# Outcome vector
y = dat$plan_chosen
n = length(y)

# Table to look up the respondent 
i_lookup = select(dat, caseid, i)
n_resp = length(unique(i_lookup$i))

# Matrices containing the factor levels
educ_mat = model.matrix(~ educ_ind_1 + educ_ind_2 + educ_ind_3 + educ_ind_4 - 1, data = dat)
k_educ = ncol(educ_mat)
hieduc_mat = model.matrix(~ hieduc_ind_1 + hieduc_ind_2 + hieduc_ind_3 + hieduc_ind_4 - 1, data = dat)
k_hieduc = ncol(hieduc_mat)
invest_mat = model.matrix(~ invest_ind_1 + invest_ind_2 + invest_ind_3 - 1, data = dat)
k_invest = ncol(invest_mat)
gov_mat = model.matrix(~ gov_ind_1 + gov_ind_2 - 1, data = dat)
k_gov = ncol(gov_mat)
workers_mat = model.matrix(~ workers_ind_1 + workers_ind_2 + workers_ind_3 + workers_ind_4 - 1, data = dat)
k_workers = ncol(workers_mat)
local_mat = model.matrix(~ local_ind_1 + local_ind_2 + local_ind_3 - 1, data = dat)
k_local = ncol(local_mat)



# Put data into a list
standat = list(
  y = y, 
  n = n, 
  n_resp = n_resp,
  ii = i_lookup$i,
  educ_mat = educ_mat,
  k_educ = k_educ,
  hieduc_mat = hieduc_mat,
  k_hieduc = k_hieduc,
  invest_mat = invest_mat,
  k_invest = k_invest,
  gov_mat = gov_mat,
  k_gov = k_gov,
  workers_mat = workers_mat,
  k_workers = k_workers,
  local_mat = local_mat,
  k_local = k_local,
  k_indvl = k_indvl,
  ind_covariates = ind_covariates
)


# Set Starting Values -----------------------------------------------------

# Function to generate random starting values, generated as draws from 
# the prior. 
get_start_vals = function() {
  
  # Use OLS to get a sense of the mean of the beta's
  educ_start    = coef(lm(y ~ educ_mat))[-1]
  hieduc_start  = coef(lm(y ~ hieduc_mat))[-1]
  invest_start  = coef(lm(y ~ invest_mat))[-1]
  gov_start     = coef(lm(y ~ gov_mat))[-1]
  workers_start = coef(lm(y ~ workers_mat))[-1]
  local_start   = coef(lm(y ~ local_mat))[-1]
  
  beta_educ_start    = t(replicate(n = n_resp, mvrnorm(n = 1, mu = educ_start, Sigma = diag(5, nrow = k_educ))))
  beta_hieduc_start  = t(replicate(n = n_resp, mvrnorm(n = 1, mu = hieduc_start, Sigma = diag(5, nrow = k_hieduc))))
  beta_invest_start  = t(replicate(n = n_resp, mvrnorm(n = 1, mu = invest_start, Sigma = diag(5, nrow = k_invest))))
  beta_gov_start     = t(replicate(n = n_resp, mvrnorm(n = 1, mu = gov_start, Sigma = diag(5, nrow = k_gov))))
  beta_workers_start = t(replicate(n = n_resp, mvrnorm(n = 1, mu = workers_start, Sigma = diag(5, nrow = k_workers))))
  beta_local_start   = t(replicate(n = n_resp, mvrnorm(n = 1, mu = local_start, Sigma = diag(5, nrow = k_local))))
  
  intercept_start = rnorm(n = n_resp, mean = .5, sd = .25)
  
  # get random starting values for gammas.
  gamma_educ_start       = replicate(k_indvl, runif(k_educ, min = -2, max = 2) )
  gamma_hieduc_start     = replicate(k_indvl, runif(k_hieduc, min = -2, max = 2) )
  gamma_invest_start     = replicate(k_indvl, runif(k_invest, min = -2, max = 2) )
  gamma_gov_start        = replicate(k_indvl, runif(k_gov, min = -2, max = 2))
  gamma_workers_start    = replicate(k_indvl, runif(k_workers, min = -2, max = 2))
  gamma_local_start      = replicate(k_indvl, runif(k_local, min = -2, max = 2))
  gamma_intercept_start  = runif(k_indvl, min = -2, max = 2)
  
  # Random starting values for the building blocks of the covariance matrices
  educ_sd_start     = abs(rcauchy(k_educ, location = 0, scale = 2))
  hieduc_sd_start   = abs(rcauchy(k_hieduc, location = 0, scale = 2))
  invest_sd_start   = abs(rcauchy(k_invest, location = 0, scale = 2))
  gov_sd_start      = abs(rcauchy(k_gov, location = 0, scale = 2))
  workers_sd_start  = abs(rcauchy(k_workers, location = 0, scale = 2))
  local_sd_start    = abs(rcauchy(k_local, location = 0, scale = 2))
  
  
  start_vals = list(
    gamma_educ = gamma_educ_start,
    gamma_hieduc = gamma_hieduc_start,
    gamma_invest = gamma_invest_start,
    gamma_gov = gamma_gov_start,
    gamma_workers = gamma_workers_start,
    gamma_local = gamma_local_start,
    gamma_intercept = gamma_intercept_start,
    beta_educ_tilde = beta_educ_start,
    beta_hieduc_tilde = beta_hieduc_start,
    beta_invest_tilde = beta_invest_start,
    beta_gov_tilde = beta_gov_start,
    beta_workers_tilde = beta_workers_start,
    beta_local_tilde = beta_local_start,
    intercept_tilde = intercept_start,
    intercept_sd = abs(rcauchy(1, location = 0, scale = 2)),
    residual_sd = abs(rcauchy(1, location = 0, scale = 2)),
    educ_sd = educ_sd_start,
    hieduc_sd = hieduc_sd_start,
    invest_sd = invest_sd_start,
    gov_sd = gov_sd_start,
    workers_sd = workers_sd_start,
    local_sd = local_sd_start
  )
  return(start_vals)
}










# Compile and Estimate Model ----------------------------------------------


# Compile Stan code
mod = stan_model(file = "Code/hlm/stancode/raneff_ind_covs_indpt_slopes_ncp.stan")



cat("Now starting MCMC sampling...\n\n")
t0 = Sys.time()
iter = 1200  # how many iterations for each chain? 
sampout = sampling(mod, data = standat, init = get_start_vals,
                   chains = chains, iter = iter, 
                   pars = c("beta_educ_tilde", "beta_hieduc_tilde", "beta_invest_tilde", 
                            "beta_gov_tilde", "beta_workers_tilde", "beta_local_tilde",
                            "intercept_tilde"),
                   include = FALSE
)
t1 = Sys.time()
cat("Sampling", iter, "iterations each on", chains, "chains, with N =", n, ". It took", round(t1 - t0, 2), units(t1-t0), "\n\n")
time_elapsed = t1 - t0
# save(sampout, ind_dat, standat, i_lookup, time_elapsed, file = paste0("data/stan_output/demos_socioecon_pid7_sampsize_", n_test, ".rdata"))
save(sampout, ind_dat, standat, i_lookup, time_elapsed, file = "Data/demos_socioecon_pid7_full_sample.rdata")


## Save parameter summary separately using summary.stanfit()
summ = lapply(sampout@sim$pars_oi, function(x) as.data.frame(summary(sampout, pars = x)$summary))
summ = lapply(summ, function(x){ x$parname <- rownames(x); return(x)})
summ = bind_rows(lapply(summ, as.data.frame))
saveRDS(summ, "Data/summary_socioecon_pid7.rds")



